library(tidyverse)
library(lmtest) # robust SEs
library(sandwich)
library(countrycode) # for matching country names
library(dataverse) # for accessing replication data on the dataverse
library(WDI) # World Developement indicators API
library(ggdist) # Distribution plots
library(marginaleffects) # marginal effects estimation tools
library(modelsummary)
library(performance)Class 03
Packages
Packages (there’s a lot!)
Also, if you don’t have it already, you’ll want to install the vdemdata package. This provides easy access to the latest version of the Varieties of Democracy Dataset :
devtools::install_github("vdeminstitute/vdemdata")Preparing the data
First, we need to do a little data set construction. This is messy, but rather than leave it out I figured I would show it here to give a sense of the process.
We’ll pull from three difference data sources:
Varieties of Democracy (V-DEM) for global data on democracy and governance (codebook)
World Bank Data world (information) (R package info) for economic indicators
Global Legislators Data (dataverse) (codebook) for information about the ages of legislators
# get global legislators data
gld <- get_dataframe_by_name(
filename = "global_legislator_database.tab",
dataset = "10.7910/DVN/U1ZNVT",
server = "dataverse.harvard.edu")
# aggregate to get average age for each legislature
gld_agg <- gld |>
group_by(country_name, year_of_election) |>
summarise(
nleg = n(),
mean_age = mean(year_of_election - year(dob), na.rm = T)
)|>
# add VDEM matching code
mutate(vdem_code = countrycode(country_name, origin = 'country.name', destination = 'vdem'))
# get vdem data. Filter by year and select usefil columns
vdem<-vdemdata::vdem|>
filter(year>=2010)|>
select(year, country_id, e_pelifeex, v2elpubfin,v2xnp_client,e_regionpol, v2elparlel, v2x_corr, v2x_polyarchy)|>
mutate(electoral_system = factor(v2elparlel, labels=c("Majoritarian", "Proportional", "Mixed","Other")),
)|>
rename(public_finance = v2elpubfin)
# get data from the world bank WDI
wdi_data<-WDI(indicator=c('gdp_pcap'= 'NY.GDP.PCAP.KD',
'gini' = 'SI.POV.GINI',
'life_expectancy' = 'SP.DYN.LE00.IN',
'pop_over_65'= 'SP.POP.65UP.TO.ZS'),
start=2010, end=2023
)
# Add a vdem matching code
wdi_data<-wdi_data|>
mutate(vdem_code = countrycode(country, origin='country.name', destination='vdem'))
# Join everything on country and year. inner_join means we're dropping anything
# without a match, be careful with this (left joins are often better)
gld_joined<-gld_agg|>
inner_join(vdem, by=join_by(vdem_code == country_id, year_of_election == year))|>
inner_join(wdi_data, by = join_by(vdem_code == vdem_code, year_of_election == year) )Research Question
The U.S. has a lot of old people in congress. Why is this? How do we compare to other states? And are their structural features that might explain why we’ve got so many people born before the invention of commercially viable color television in positions of power?
gld|>
filter(country_name=="United States")|>
ggplot(aes(x=dob)) +
geom_density(fill='lightblue', alpha=.8) +
geom_vline(xintercept = as.Date("1954-12-17"), lty=2) +
annotate("text", x=as.Date("1957-12-31") + 1500, y=.000002, label="invention of color tv") +
theme_bw() +
xlab("Birthdates of members of Congress") +
ylab("")Model output
We’ll run three slightly different models and present them in a single table.
gof_stats <- c(
"nobs", "aic", "bic", "r.squared"
)
coef_names <- c("life_expectancy" = "Life expectancy",
"gdp_pcap" = "GDP per-capita",
"log(gdp_pcap)" = "Logged GDP per-capita",
"public_finance" = "Public financing of elections"
)
model_0<-lm(mean_age ~ life_expectancy + gdp_pcap,
data=gld_joined)
model_1<-lm(mean_age ~ life_expectancy + log(gdp_pcap), data =gld_joined)
model_2<-lm(mean_age ~ life_expectancy + log(gdp_pcap) + public_finance , data =gld_joined)
mlist<-list(
'Baseline model' = model_0,
'Logged GDP model' = model_1,
'Public financing' = model_2
)
modelsummary(mlist,
gof_map = gof_stats,
coef_rename = coef_names,
coef_omit = "Intercept"
)| Baseline model | Logged GDP model | Public financing | |
|---|---|---|---|
| Life expectancy | -0.114 | -0.241 | -0.177 |
| (0.063) | (0.090) | (0.087) | |
| GDP per-capita | 0.000 | ||
| (0.000) | |||
| Logged GDP per-capita | 1.156 | 1.474 | |
| (0.537) | (0.516) | ||
| Public financing of elections | -0.905 | ||
| (0.262) | |||
| Num.Obs. | 94 | 94 | 94 |
| AIC | 515.6 | 511.7 | 502.0 |
| BIC | 525.7 | 521.9 | 514.7 |
| R2 | 0.036 | 0.074 | 0.183 |
Checks
At a minimum, we should look for evidence of:
non-normality
heteroskedasticity
influential observations
non-linearity
The check_model function gives us a good starting point for identifying these issues:
check_model(model_0)The Breusch–Pagan test gives us a way to test for “significant” heteroskedasticity by running a new linear model with the squared residuals as our dependent variable.
The null hypothesis here is homoskedastic data, and the alternative is heteroskedasticity:
squared_resid<-resid(model_0)^2
resid_model<-lm(squared_resid ~ life_expectancy + gdp_pcap, data = gld_joined)
stat<-nrow(resid_model$model) * summary(resid_model)$r.squared
1-pchisq(stat, df=2)[1] 0.04699436
The performance package has a built-in function for running a variation of the Breusch-Pagan test which we can run with check_heteroskedasticity
check_heteroskedasticity(model_0)Warning: Heteroscedasticity (non-constant error variance) detected (p = 0.008).
We might also want to check for evidence of multicollinearity, although this appears to be less of a problem here:
check_collinearity(model_0)And checks for individual outliers (defined as cook’s distance greater than 0.8)
check_outliers(model_0)OK: No outliers detected.
- Based on the following method and threshold: cook (0.8).
- For variable: (Whole model)
Based on the analyses above, do you see evidence to support any changes to the model such as transforming variables?
Robust Standard Errors
Since we have some clear evidence of heteroskedasticity, it probably makes sense to go ahead and use heteroskedasticity robust standard errors, which relax the assumption of constant variance.
We can get heteroskedasticity consistent standard errors using the coeftest function from the lmtest package.
library(lmtest)
m_0<-coeftest(model_0, vcov = vcovHC(model_0, "HC0"))
modelsummary(list("classic" = model_0, "robust" = m_0),
gof_map = gof_stats,
coef_rename = coef_names,
coef_omit = "Intercept")| classic | robust | |
|---|---|---|
| Life expectancy | -0.114 | -0.114 |
| (0.063) | (0.068) | |
| GDP per-capita | 0.000 | 0.000 |
| (0.000) | (0.000) | |
| Num.Obs. | 94 | 94 |
| AIC | 515.6 | 689.6 |
| BIC | 525.7 | 921.0 |
| R2 | 0.036 |
The model summary package will also apply these changes directly to a list models, so I can try out different methods of calculating the standard errors and p-values for the same model:
modelsummary(model_0,
gof_map = gof_stats,
coef_rename = coef_names,
vcov = c("classical", "HC0", "bootstrap")
)| (1) | (2) | (3) | |
|---|---|---|---|
| (Intercept) | 57.855 | 57.855 | 57.855 |
| (4.430) | (4.943) | (5.165) | |
| Life expectancy | -0.114 | -0.114 | -0.114 |
| (0.063) | (0.068) | (0.071) | |
| GDP per-capita | 0.000 | 0.000 | 0.000 |
| (0.000) | (0.000) | (0.000) | |
| Num.Obs. | 94 | 94 | 94 |
| AIC | 515.6 | 515.6 | 515.6 |
| BIC | 525.7 | 525.7 | 525.7 |
| R2 | 0.036 | 0.036 | 0.036 |
Model Comparison
Which models are better? We can compare models using BIC/AIC (lower scores mean better models, all else equal)
compare_performance(list(model_0, model_1, model_2))Or, if the models are nested, we can compare using F-tests
Equivalence Testing
We can see that the effect of electoral_system is not statistically significant, but what if we want to rule out the possibility that the a PR system would result in younger legislators relative to a majoritarian system?
model<-lm(mean_age ~
life_expectancy +
public_finance +
electoral_system, data=gld_joined)
model_robust<-coeftest(model, vcov = vcovHC(model, "HC0"))
model_robust
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 49.048664 5.954992 8.2366 1.419e-12 ***
life_expectancy 0.028756 0.086836 0.3311 0.7413
public_finance -0.664078 0.327375 -2.0285 0.0455 *
electoral_systemProportional -1.089526 1.191739 -0.9142 0.3631
electoral_systemMixed -0.771819 1.486539 -0.5192 0.6049
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We’ll set a minimum effect size of 0.66112. Essentially, we are attempting to rule out the possibility that the effect of proportional representation (relative to majoritarianism) is greater than the effect of public financing for elections:
min_effect <- 0.66112
broom::tidy(model_robust, conf.int = TRUE, conf.level = .90)|>
filter(term == 'electoral_systemProportional')|>
select(term, estimate, conf.low, conf.high)|>
mutate(equi_lb = min_effect*-1,
equi_ub = min_effect)|>
ggplot(aes(y=term)) + geom_pointrange(aes(xmin=conf.low, xmax=conf.high, x=estimate)) +
geom_vline(aes(xintercept=equi_lb), color='red', lty=2) +
geom_vline(aes(xintercept=equi_ub), color='red', lty=2) +
theme_bw()An alternative to using heteroskedasticity consistent standard errors is to use the non-parametric bootstrap. This also allows us to generate a useful visualization of the cases where we might see effects above the minimum threshold we set above:
comp <- avg_comparisons(model,
variables = list("electoral_system" = c("Majoritarian", "Proportional")),
vcov = 'HC0')
booted<-inferences(comp, method='boot', conf_type = 'perc', R=1000)
draws<-get_draws(booted)
ggplot(draws, aes(x=draw, y='Simulated Differences'))+
stat_halfeye(alpha=.5, .width = c(.90), aes(fill=after_stat(x>=min_effect | x<= -min_effect))) +
geom_vline(xintercept=-min_effect, lty=2, col='black') +
geom_vline(xintercept=min_effect, lty=2, col='black') +
theme_bw() +
scale_fill_brewer(palette='Dark2')+
xlab("Simulated Differences") +
ylab("")+
coord_flip()Based on these results, can we rule out an effect for proportional representation?